In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, IncrementalPCA, FastICA
plt.style.use("ggplot")
%matplotlib inline
In [2]:
# Load the training and test tables; the ID column becomes the index.
train, test = (
    pd.read_csv("data/%s.csv" % split, index_col="ID")
    for split in ("train", "test")
)
In [3]:
# Peek at the first rows of the raw training frame.
train.head()
Out[3]:
In [16]:
# How many columns of each dtype (numeric vs object/categorical).
train.dtypes.value_counts()
Out[16]:
In [17]:
# X0-X8 are alphanumeric
# NOTE(review): the slice spans 11 columns — confirm it covers exactly the
# categorical block and nothing more.
train.iloc[:, 1:12].head()
Out[17]:
In [37]:
# Summary statistics of the regression target y.
train.y.describe()
Out[37]:
In [19]:
# Category frequencies per alphanumeric column.
# NOTE(review): the next cells repeat the same value_counts() call for each
# column — a single loop over ["X0", ..., "X8"] would be less error-prone.
train.X0.value_counts()
Out[19]:
In [20]:
train.X1.value_counts()
Out[20]:
In [21]:
train.X2.value_counts()
Out[21]:
In [22]:
train.X3.value_counts()
Out[22]:
In [23]:
train.X4.value_counts()
Out[23]:
In [24]:
train.X5.value_counts()
Out[24]:
In [25]:
train.X6.value_counts()
Out[25]:
In [30]:
# X7 is missing
# (no X7 column exists in the data, so the scan jumps from X6 to X8)
train.X8.value_counts()
Out[30]:
In [10]:
# Do train and test share the same category set per column?  If not, one-hot
# encoding must be fitted on the combined data (done in a later cell).
# NOTE(review): X0 is never compared here — confirm whether that omission
# was intentional.
set(train.X1.values) == set(test.X1.values)
Out[10]:
In [11]:
set(train.X2.values) == set(test.X2.values)
Out[11]:
In [14]:
# Inspect the actual X2 category sets — presumably because the equality
# check above returned False; verify against the recorded output.
set(train.X2.values), set(test.X2.values)
Out[14]:
In [15]:
set(train.X3.values) == set(test.X3.values)
Out[15]:
In [16]:
set(train.X4.values) == set(test.X4.values)
Out[16]:
In [17]:
set(train.X5.values) == set(test.X5.values)
Out[17]:
In [20]:
# Inspect the X5 category sets as well.
set(train.X5.values), set(test.X5.values)
Out[20]:
In [18]:
set(train.X6.values) == set(test.X6.values)
Out[18]:
In [23]:
set(train.X8.values) == set(test.X8.values)
Out[23]:
In [6]:
# One-hot encode the categorical block on train+test *combined*, so both
# frames get an identical dummy-column layout even though some category
# levels appear in only one of the two (see the set comparisons above).
categorical_cols = ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]

# Tag each row with its origin so the frames can be split apart afterwards.
train["data"] = "train"
test["data"] = "test"
combined_data = pd.concat([train, test])

# Encode the categorical columns, then swap the dummies in for the raw ones.
# Listing the columns once (categorical_cols) keeps the select and the drop
# in sync — the original listed them twice.
encoded = pd.get_dummies(combined_data[categorical_cols])
drop_cat = combined_data.drop(categorical_cols, axis=1)
combined_data_clean = drop_cat.join(encoded)

# Split back into train/test.  drop() without inplace=True keeps each
# statement a plain reassignment instead of a hidden mutation.
# The test rows have no target, so 'y' (all-NaN there after concat) is
# dropped from test_data.
train_data = combined_data_clean[combined_data_clean.data == "train"].copy()
test_data = combined_data_clean[combined_data_clean.data == "test"].copy()
train_data = train_data.drop("data", axis=1)
test_data = test_data.drop(["data", "y"], axis=1)
In [9]:
# Split target from features and cast everything to float32 for the
# decomposition steps below.
y_train = train_data.loc[:, "y"].astype(np.float32)
x_train = train_data.drop(columns=["y"]).astype(np.float32)
x_test = test_data.astype(np.float32)
In [12]:
# Feature-matrix dimensionality after one-hot encoding.
x_train.shape
Out[12]:
In [14]:
# Fit a 128-component PCA on the training features.
n_comp = 128
pca = PCA(n_components=n_comp, random_state=42)
# NOTE(review): .fit() returns the fitted estimator itself, so pca_train is
# the PCA object (same as pca), not transformed data — misleading name.
pca_train = pca.fit(x_train)
In [17]:
# Fraction of total variance captured by the 128 components.
sum(pca_train.explained_variance_ratio_)
Out[17]:
In [21]:
def pca_variance_scan(features, component_counts, random_state=42):
    """Fit one PCA per component count and return (n, total explained
    variance ratio) pairs, for picking a good n_components.

    Extracted because this scan loop was copy-pasted four times below
    with only the range (and PCA flavor) changing.
    """
    results = []
    for n in component_counts:
        fitted = PCA(n_components=n, random_state=random_state).fit(features)
        results.append((n, sum(fitted.explained_variance_ratio_)))
    return results

# NOTE: the first scan value, n_components=0, fits a PCA with no components
# and always reports 0 variance — kept for output compatibility, but the
# scan effectively starts at 8.
explained_variance = pca_variance_scan(x_train, range(0, 256, 8))
In [22]:
# View the scan as an n_components -> explained-variance mapping.
dict(explained_variance)
Out[22]:
In [25]:
# Extend the PCA component scan into the 240-392 range.
explained_variance = []
for n_components in range(240, 400, 8):
    pca = PCA(n_components=n_components, random_state=42)
    pca_train = pca.fit(x_train)
    explained_variance.append(
        (n_components, sum(pca_train.explained_variance_ratio_))
    )
In [26]:
# PCA: 96 - 90%, 128 - 94%, 144 - 95%, 384 - 99%
# (summary of the scans above: components needed per variance level)
explained_variance
Out[26]:
In [28]:
# Same exercise with IncrementalPCA (batch-wise fitting, default batch size).
n_comp = 128
ipca = IncrementalPCA(n_components=n_comp)
# As with PCA above, .fit() returns the estimator itself, not transformed data.
ipca_train = ipca.fit(x_train)
In [32]:
# Total variance explained by the 128 incremental components.
sum(ipca_train.explained_variance_ratio_)
Out[32]:
In [35]:
# Scan IncrementalPCA explained variance over 1..249 components.
explained_variance = []
for n_components in range(1, 256, 8):
    ipca = IncrementalPCA(n_components=n_components)
    ipca_train = ipca.fit(x_train)
    explained_variance.append(
        (n_components, sum(ipca_train.explained_variance_ratio_))
    )
In [36]:
# Inspect the IncrementalPCA scan results.
explained_variance
Out[36]:
In [37]:
# Continue the IncrementalPCA scan into the 240-392 component range.
explained_variance = []
for n_components in range(240, 400, 8):
    ipca = IncrementalPCA(n_components=n_components)
    ipca_train = ipca.fit(x_train)
    explained_variance.append(
        (n_components, sum(ipca_train.explained_variance_ratio_))
    )
In [38]:
# Inspect the extended IncrementalPCA scan results.
explained_variance
Out[38]:
In [42]:
# Independent component analysis with 128 components.
n_comp = 128
ica = FastICA(n_components=n_comp, random_state=42)
# Unlike the pca_train/ipca_train names above, this really IS transformed
# data: fit_transform() returns the projected array, not the estimator.
ica_train = ica.fit_transform(x_train)
In [43]:
# Peek at the ICA-projected training matrix.
ica_train
Out[43]:
In [4]:
# Target statistics (duplicate of an earlier describe() cell — one of the
# two could be removed).
train.y.describe()
Out[4]:
In [5]:
# Line plot of y against the ID index.
train.y.plot()
Out[5]:
In [6]:
# Distribution of the target; a trailing ';' would hide the Axes repr.
train.y.hist()
Out[6]:
In [ ]:
In [ ]:
In [ ]: